#!/home/chunwei/chunenv/bin/python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import pandas as pd
import numpy as np
import argparse
from scipy.optimize import minimize
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn import metrics
import os
import xgboost as xgb

# parse command-line arguments
def parse_args(argv=None):
    """Parse the command-line arguments of the GBDT training script.

    Args:
        argv: Optional list of argument strings, without the program name.
            When omitted, ``sys.argv[1:]`` is used.

    Returns:
        A dict with the keys 'niter', 'max_depth', 'cv', 'train_path' and
        'test_path'. Values are the raw strings from the command line;
        callers convert them as needed.

    If no arguments are supplied at all, the help text is printed and the
    process exits, exactly as if ``-h`` had been passed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('niter', help='number of boosting rounds')
    parser.add_argument('max_depth', help='maximum tree depth')
    parser.add_argument('cv', help='integer flag; > 0 runs cross validation')
    parser.add_argument('train_path', help='path of the training data')
    #parser.add_argument('valid_path')
    parser.add_argument('test_path', help='path of the test data')

    if argv is None:
        argv = sys.argv[1:]
    if not argv:
        # Show usage on a bare invocation without touching the global
        # sys.argv list.
        argv = ['-h']
    return vars(parser.parse_args(argv))

# Read the command line once and unpack it into module-level settings.
args = parse_args()
train_path, test_path = args['train_path'], args['test_path']
#valid_path = args['valid_path']
niter, max_depth = int(args['niter']), int(args['max_depth'])
# cv is given as an integer flag: any value above zero enables it.
cv = 0 < int(args['cv'])

# Output paths are derived from the input paths by appending a fixed suffix.
test_pred_path = test_path + ".gbdt.pred"
train_model_path = train_path + ".gbdt.model"
# Leaf indices are written out to serve as new features.
train_leaf_path = train_path + ".gbdt.leaf.index"
#valid_leaf_path = valid_path + ".gbdt.leaf.index"
test_leaf_path = test_path + ".gbdt.leaf.index"

#train_path = './feature/tmp/train2.parts.0.train.svm'
#valid_path = './feature/tmp/train2.parts.0.valid.svm'
#test_path = './feature/tmp/test2.train.fm'

# Load the train and test sets directly from their files.
dtrain = xgb.DMatrix(train_path)
#dvalid = xgb.DMatrix(valid_path)
dtest = xgb.DMatrix(test_path)

train_labels = dtrain.get_label()

# (row count - label sum) / label sum. Assuming 0/1 labels, as the
# binary:logistic objective below implies, this is the negative-to-positive
# ratio used to re-weight the positive class.
scale_pos_weight = (len(train_labels) - np.sum(train_labels)) / np.sum(train_labels)
print('scale_pos_weight %s' % scale_pos_weight)

# Booster parameters. The plain 'max_depth' / 'eta' names are used instead of
# the legacy 'bst:'-prefixed spelling so that the requested depth and learning
# rate are honoured by every xgboost release.
param = {
    'max_depth': max_depth,
    'eta': 0.1,
    'silent': 1,
    'objective': 'binary:logistic',
    'scale_pos_weight': scale_pos_weight,
    'nthread': 15,
}

# xgb.train gets the parameters as a list of pairs, which allows repeated
# keys such as several eval_metric entries. list() keeps this working when
# dict.items() returns a view rather than a list.
plst = list(param.items())
plst.append(('eval_metric', 'auc'))

# Watch list for training. With a held-out set this would be
# [(dvalid, 'eval'), (dtrain, 'train')].
evallist = [(dtrain, 'train')]

if cv:
    print('cross validate ...')
    # Keep the CV result under its own name and report it; it is not a
    # booster and must not be confused with the model trained below.
    cv_result = xgb.cv(param, dtrain, niter + 30, nfold=5,
                       metrics={'auc'}, seed=0)
    print(cv_result)

# NOTE(review): the only watch-list entry is the training set, so early
# stopping monitors training AUC rather than a held-out score.
bst = xgb.train(plst, dtrain, niter, evallist, early_stopping_rounds=7)

# Leaf-index export is disabled. The retired code wrote, for each of the
# train / valid / test matrices, one line per row to the matching
# *_leaf_path file, each line holding the space-separated values returned by
# bst.predict(data=<matrix>, pred_leaf=True). Re-enabling the valid part also
# needs a validation DMatrix (dvalid) and valid_leaf_path, neither of which is
# currently defined.

# output test predictions
print('write predictions to  %s' % test_pred_path)
# Predict before opening the file so that a failing predict() does not leave
# an empty predictions file behind.
preds = bst.predict(data=dtest)
with open(test_pred_path, 'w') as f:
    # One prediction per line; no newline after the last value.
    f.write('\n'.join(str(p) for p in preds))

print('save model to  %s' % train_model_path)
bst.save_model(train_model_path)
